PART 1

Note: throughout the course, we will mix between the base R notation and tidyverse notation because it is not always the easiest to use the tidyverse way on these genetic data.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(readr)

SNPs and alleles

  1. Read data from file data_a_intro_R.dat and save it to obj:
obj <- read_table2("data_a_intro_R.dat")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ID = col_character()
## )
## See spec(...) for full column specifications.
  1. Look at the data: check the class, dimensions and print the first part of the data:
class(obj)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
obj
  1. Using the data
  • Accessing the column named cc.status, using the $ sign (0 codes for control and 1 for case)
cc.status <- obj$cc.status
class(cc.status)
## [1] "numeric"
mean(cc.status)
## [1] 0.5555556
summary(cc.status)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.5556  1.0000  1.0000
table(cc.status)
## cc.status
##   0   1 
## 400 500
  • Accessing the first SNP
snp1 <- obj %>%
  select(snp_1.1, snp_1.2)
snp1
  • Tabulating the first SNP
table(snp1$snp_1.1)
## 
##   1   2 
## 101 799
table(snp1$snp_1.2)
## 
##   1   2 
## 108 792
table(snp1)
##        snp_1.2
## snp_1.1   1   2
##       1  15  86
##       2  93 706
  • Divide by number of indivdiuals (function nrow will count all the rows, i.e., individuals) and check whether it sums up to 100%
table(snp1)/nrow(snp1)
##        snp_1.2
## snp_1.1          1          2
##       1 0.01666667 0.09555556
##       2 0.10333333 0.78444444
sum(table(snp1)/nrow(snp1))
## [1] 1
  • Rounding to two decimal points - still sums up to 100%!
round(table(snp1)/nrow(snp1), 2)
##        snp_1.2
## snp_1.1    1    2
##       1 0.02 0.10
##       2 0.10 0.78
sum(round(table(snp1)/nrow(snp1), 2))
## [1] 1
  • Tabulate SNP8
snp8 <- obj %>%
  select(snp_8.1, snp_8.2)
snp8
table(snp8$snp_8.1)
## 
##  -9   1   2 
##  10 155 669
table(snp8$snp_8.2)
## 
##  -9   1   2 
##  10 182 642
table(snp8)
##        snp_8.2
## snp_8.1  -9   1   2
##      -9  10   0   0
##      1    0  32 123
##      2    0 150 519
  • -9 means missing. Replace -9 with NA (not available) in the entire dataset.
obj[ obj == -9 ] <- NA
snp8 <- obj %>%
  select(snp_8.1, snp_8.2)

Exercise 1

Fill the space between apostrophes with your code and click the green arrow to check how it evaluates.

1.1 Tabulate SNP8 again.
1.2 What is the distribution of the alleles?
1.3 Why does it not sum to 100%? Hint: ?table

Missing data, apply, which, names, any, remove rows and columns from object

  1. Set the argument useNA = "ifany" to make the distribution of alleles in SNP8 sum up to 100%
table( snp8, useNA = "ifany" )/nrow( snp8 )
##        snp_8.2
## snp_8.1          1          2       <NA>
##    1    0.03555556 0.13666667 0.00000000
##    2    0.16666667 0.57666667 0.00000000
##    <NA> 0.00000000 0.00000000 0.08444444
sum( table( snp8, useNA = "ifany" )/nrow( snp8 ) )
## [1] 1
  1. Find those columns in obj that contain missing data
na.obj <- is.na(obj)
head(na.obj)
##         ID cc.status snp_1.1 snp_1.2 snp_2.1 snp_2.2 snp_3.1 snp_3.2 snp_4.1
## [1,] FALSE     FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE    TRUE
## [2,] FALSE     FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE    TRUE
## [3,] FALSE     FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
## [4,] FALSE     FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
## [5,] FALSE     FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE    TRUE
## [6,] FALSE     FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
##      snp_4.2 snp_5.1 snp_5.2 snp_6.1 snp_6.2 snp_7.1 snp_7.2 snp_8.1 snp_8.2
## [1,]    TRUE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
## [2,]    TRUE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE    TRUE    TRUE
## [3,]   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
## [4,]   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
## [5,]    TRUE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE    TRUE    TRUE
## [6,]   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE   FALSE
##      snp_9.1 snp_9.2 snp_10.1 snp_10.2 snp_11.1 snp_11.2 snp_12.1 snp_12.2
## [1,]   FALSE   FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [2,]   FALSE   FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [3,]   FALSE   FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [4,]   FALSE   FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [5,]   FALSE   FALSE    FALSE    FALSE    FALSE    FALSE     TRUE     TRUE
## [6,]   FALSE   FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
##      snp_13.1 snp_13.2 snp_14.1 snp_14.2 snp_15.1 snp_15.2 snp_16.1 snp_16.2
## [1,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [2,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [3,]    FALSE    FALSE     TRUE     TRUE    FALSE    FALSE    FALSE    FALSE
## [4,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [5,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [6,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
##      snp_17.1 snp_17.2 snp_18.1 snp_18.2 snp_19.1 snp_19.2 snp_20.1 snp_20.2
## [1,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [2,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [3,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [4,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [5,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [6,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
##      snp_21.1 snp_21.2 snp_22.1 snp_22.2 snp_23.1 snp_23.2
## [1,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [2,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [3,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [4,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [5,]    FALSE    FALSE    FALSE    FALSE    FALSE    FALSE
## [6,]    FALSE    FALSE     TRUE     TRUE    FALSE    FALSE
head(na.obj)*1
##      ID cc.status snp_1.1 snp_1.2 snp_2.1 snp_2.2 snp_3.1 snp_3.2 snp_4.1
## [1,]  0         0       0       0       0       0       0       0       1
## [2,]  0         0       0       0       0       0       0       0       1
## [3,]  0         0       0       0       0       0       0       0       0
## [4,]  0         0       0       0       0       0       0       0       0
## [5,]  0         0       0       0       0       0       0       0       1
## [6,]  0         0       0       0       0       0       0       0       0
##      snp_4.2 snp_5.1 snp_5.2 snp_6.1 snp_6.2 snp_7.1 snp_7.2 snp_8.1 snp_8.2
## [1,]       1       0       0       0       0       0       0       0       0
## [2,]       1       0       0       0       0       0       0       1       1
## [3,]       0       0       0       0       0       0       0       0       0
## [4,]       0       0       0       0       0       0       0       0       0
## [5,]       1       0       0       0       0       0       0       1       1
## [6,]       0       0       0       0       0       0       0       0       0
##      snp_9.1 snp_9.2 snp_10.1 snp_10.2 snp_11.1 snp_11.2 snp_12.1 snp_12.2
## [1,]       0       0        0        0        0        0        0        0
## [2,]       0       0        0        0        0        0        0        0
## [3,]       0       0        0        0        0        0        0        0
## [4,]       0       0        0        0        0        0        0        0
## [5,]       0       0        0        0        0        0        1        1
## [6,]       0       0        0        0        0        0        0        0
##      snp_13.1 snp_13.2 snp_14.1 snp_14.2 snp_15.1 snp_15.2 snp_16.1 snp_16.2
## [1,]        0        0        0        0        0        0        0        0
## [2,]        0        0        0        0        0        0        0        0
## [3,]        0        0        1        1        0        0        0        0
## [4,]        0        0        0        0        0        0        0        0
## [5,]        0        0        0        0        0        0        0        0
## [6,]        0        0        0        0        0        0        0        0
##      snp_17.1 snp_17.2 snp_18.1 snp_18.2 snp_19.1 snp_19.2 snp_20.1 snp_20.2
## [1,]        0        0        0        0        0        0        0        0
## [2,]        0        0        0        0        0        0        0        0
## [3,]        0        0        0        0        0        0        0        0
## [4,]        0        0        0        0        0        0        0        0
## [5,]        0        0        0        0        0        0        0        0
## [6,]        0        0        0        0        0        0        0        0
##      snp_21.1 snp_21.2 snp_22.1 snp_22.2 snp_23.1 snp_23.2
## [1,]        0        0        0        0        0        0
## [2,]        0        0        0        0        0        0
## [3,]        0        0        0        0        0        0
## [4,]        0        0        0        0        0        0
## [5,]        0        0        0        0        0        0
## [6,]        0        0        1        1        0        0
  • Function is.na returns TRUE or FALSE for each value, but we can translate it to 0’s and 1’s, which is easier to handle when calculating some properties of each SNP.
  1. Identify proportion of missing data for each SNP
na.snp <- apply(na.obj, 2, mean)
na.snp
##         ID  cc.status    snp_1.1    snp_1.2    snp_2.1    snp_2.2    snp_3.1 
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 
##    snp_3.2    snp_4.1    snp_4.2    snp_5.1    snp_5.2    snp_6.1    snp_6.2 
## 0.00000000 0.51000000 0.51000000 0.00000000 0.00000000 0.00000000 0.00000000 
##    snp_7.1    snp_7.2    snp_8.1    snp_8.2    snp_9.1    snp_9.2   snp_10.1 
## 0.00000000 0.00000000 0.08444444 0.08444444 0.00000000 0.00000000 0.00000000 
##   snp_10.2   snp_11.1   snp_11.2   snp_12.1   snp_12.2   snp_13.1   snp_13.2 
## 0.00000000 0.00000000 0.00000000 0.07888889 0.07888889 0.00000000 0.00000000 
##   snp_14.1   snp_14.2   snp_15.1   snp_15.2   snp_16.1   snp_16.2   snp_17.1 
## 0.06000000 0.06000000 0.00000000 0.00000000 0.01000000 0.01000000 0.00000000 
##   snp_17.2   snp_18.1   snp_18.2   snp_19.1   snp_19.2   snp_20.1   snp_20.2 
## 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 
##   snp_21.1   snp_21.2   snp_22.1   snp_22.2   snp_23.1   snp_23.2 
## 0.07111111 0.07111111 0.06111111 0.06111111 0.08444444 0.08444444
  • Which SNPs have more than 7% missing?
na.snp.07 <- which(na.snp > 0.07)
na.snp.07
##  snp_4.1  snp_4.2  snp_8.1  snp_8.2 snp_12.1 snp_12.2 snp_21.1 snp_21.2 
##        9       10       17       18       25       26       43       44 
## snp_23.1 snp_23.2 
##       47       48
names(na.snp.07)
##  [1] "snp_4.1"  "snp_4.2"  "snp_8.1"  "snp_8.2"  "snp_12.1" "snp_12.2"
##  [7] "snp_21.1" "snp_21.2" "snp_23.1" "snp_23.2"
  • Look at first 10 rows of SNPs with more than 7% missing
obj %>% select(!!names(na.snp.07))
  • Which SNPs have any missing data?
apply(na.obj, 2, any)
##        ID cc.status   snp_1.1   snp_1.2   snp_2.1   snp_2.2   snp_3.1   snp_3.2 
##     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##   snp_4.1   snp_4.2   snp_5.1   snp_5.2   snp_6.1   snp_6.2   snp_7.1   snp_7.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##   snp_8.1   snp_8.2   snp_9.1   snp_9.2  snp_10.1  snp_10.2  snp_11.1  snp_11.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##  snp_12.1  snp_12.2  snp_13.1  snp_13.2  snp_14.1  snp_14.2  snp_15.1  snp_15.2 
##      TRUE      TRUE     FALSE     FALSE      TRUE      TRUE     FALSE     FALSE 
##  snp_16.1  snp_16.2  snp_17.1  snp_17.2  snp_18.1  snp_18.2  snp_19.1  snp_19.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##  snp_20.1  snp_20.2  snp_21.1  snp_21.2  snp_22.1  snp_22.2  snp_23.1  snp_23.2 
##     FALSE     FALSE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE

We can do the same in two different ways:

  • directly on the original data set:
apply( is.na( obj ), 2, any)
##        ID cc.status   snp_1.1   snp_1.2   snp_2.1   snp_2.2   snp_3.1   snp_3.2 
##     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##   snp_4.1   snp_4.2   snp_5.1   snp_5.2   snp_6.1   snp_6.2   snp_7.1   snp_7.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##   snp_8.1   snp_8.2   snp_9.1   snp_9.2  snp_10.1  snp_10.2  snp_11.1  snp_11.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##  snp_12.1  snp_12.2  snp_13.1  snp_13.2  snp_14.1  snp_14.2  snp_15.1  snp_15.2 
##      TRUE      TRUE     FALSE     FALSE      TRUE      TRUE     FALSE     FALSE 
##  snp_16.1  snp_16.2  snp_17.1  snp_17.2  snp_18.1  snp_18.2  snp_19.1  snp_19.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##  snp_20.1  snp_20.2  snp_21.1  snp_21.2  snp_22.1  snp_22.2  snp_23.1  snp_23.2 
##     FALSE     FALSE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE
  • and an advanced version, using a self-written function (useful if you must create your own code that is executed on all columns):
apply( obj, 2, function(x){
  any( is.na(x) )
} )
##        ID cc.status   snp_1.1   snp_1.2   snp_2.1   snp_2.2   snp_3.1   snp_3.2 
##     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##   snp_4.1   snp_4.2   snp_5.1   snp_5.2   snp_6.1   snp_6.2   snp_7.1   snp_7.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##   snp_8.1   snp_8.2   snp_9.1   snp_9.2  snp_10.1  snp_10.2  snp_11.1  snp_11.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##  snp_12.1  snp_12.2  snp_13.1  snp_13.2  snp_14.1  snp_14.2  snp_15.1  snp_15.2 
##      TRUE      TRUE     FALSE     FALSE      TRUE      TRUE     FALSE     FALSE 
##  snp_16.1  snp_16.2  snp_17.1  snp_17.2  snp_18.1  snp_18.2  snp_19.1  snp_19.2 
##      TRUE      TRUE     FALSE     FALSE     FALSE     FALSE     FALSE     FALSE 
##  snp_20.1  snp_20.2  snp_21.1  snp_21.2  snp_22.1  snp_22.2  snp_23.1  snp_23.2 
##     FALSE     FALSE      TRUE      TRUE      TRUE      TRUE      TRUE      TRUE
  • Remove SNPs with more than 7% missing:
obj <- obj %>%
  select(-!!names(na.snp.07))
obj

Exercise 2

2.1 Which individuals have any missing data? (Use apply and obj, not na.obj)

which( apply( obj, 1, function(x){
    ...
} ) )
which( apply( obj, 1, function(x){
    any( is.na( x ) )
} ) )
2.2 Keep individuals with less than 10% missing. Hint: First identify the rows with more than 10% missing, and then remove those.
more10.missing <- 
more10.missing <- which( apply( obj, 1, function(x){
    sum( is.na( x ) ) > length( x )/10
} ) )
obj.less10missing <- ...
obj.less10missing
more10.missing <- which( apply( obj, 1, function(x){
    sum( is.na( x ) ) > length( x )/10
} ) )
obj.less10missing <- obj[ -more10.missing, ]
obj.less10missing

Merging matrices

  1. Start with the original dataset
obj <- read_table2("data_a_intro_R.dat")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ID = col_character()
## )
## See spec(...) for full column specifications.
  1. data_b_intro_R.dat contains data on SNP24. Combine the previous dataset and this new SNP (bind_cols combines columnwise).
snp24 <- read_table2("data_b_intro_R.dat")
## Parsed with column specification:
## cols(
##   snp_24.1 = col_double(),
##   snp_24.2 = col_double()
## )
snp24
obj <- obj %>%
  bind_cols(snp24)
obj

Exercise 3

3.1 Add extra individuals (rows!) from data_c_intro_R.dat (with function bind_rows).
extra.indiv <- read_table2("data_c_intro_R.dat")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   ID = col_character()
## )
## See spec(...) for full column specifications.
extra.indiv
obj.extra.indiv <- obj %>% bind_rows(extra.indiv)
obj
obj.extra.indiv

Save and load objects

  • write_table writes a data.frame, tibble, or matrix into a text-formatted file.
  • save saves objects (not only matrices) to a binary file, readable by R with the load function.
  • ls function shows what objects are currently loaded into memory.
write_delim(obj, path = "data_prep_R.dat", col_names = TRUE)

save(obj, file = "obj.RData")
rm(obj)
ls()
##  [1] "cc.status"       "dput_to_string"  "extra.indiv"     "na.obj"         
##  [5] "na.snp"          "na.snp.07"       "obj.extra.indiv" "snp1"           
##  [9] "snp24"           "snp8"
load("obj.RData")
ls()
##  [1] "cc.status"       "dput_to_string"  "extra.indiv"     "na.obj"         
##  [5] "na.snp"          "na.snp.07"       "obj"             "obj.extra.indiv"
##  [9] "snp1"            "snp24"           "snp8"

Packages

One of the strengths of R is the vast number of packages that can be installed at no cost.

  1. Installing package(s)
install.packages( "Haplin" )
  1. Loading a package
library( Haplin )
## Registered S3 methods overwritten by 'ffbase':
##   method   from
##   [.ff     ff  
##   [.ffdf   ff  
##   [<-.ff   ff  
##   [<-.ffdf ff
  1. Help function for entire package
help( package = "Haplin" )

Save and load workspace.

This saves the entire content of the memory, so that one can get back to work where they’d finished. Note that this does not re-load libraries, so one needs to run all the necessary library commands before or after loading a new workspace.

save.image( "genestat.RData" )

load( "genestat.RData" )

PART 2

We are now starting to work on real genotype data!

PLINK (Will get back to this in other lectures)

Input files: data.bed, data.bim, data.fam. Not available to you! plink --bfile data --alleleACGT --recode --out data The output is found in pres.ped and pres.map. Available to you!

The files data.ped and data.map contain information about the genotype and phenotype, and about the SNPs, respectively, for a dataset with families (mother, father and a child), where the child had an oral cleft.

  1. Show the first 10 lines of data.map
read_table2("data.map", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_character(),
##   X3 = col_double()
## )
  1. Show the first 10 lines of data.ped
read_table2("data.ped", col_names = FALSE)
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_double(),
##   X5 = col_double(),
##   X6 = col_double()
## )
## See spec(...) for full column specifications.

PED-format:

FAMILY_ID ID_CHILD ID_DAD ID_MOM SEX CC GENOTYPES
1 1_1 1_3 1_2 2 1 G G T A A T C G C G G A …
1 1_2 0 0 2 0 G G T A A T G G G G A A …
1 1_3 0 0 1 1 G G T A A T C G C C G A …
2 2_1 2_3 2_2 1 0 G G T A A T G G C G G A …
2 2_2 0 0 2 0 G G A A T T G G G G A A …
2 2_3 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 …

Reading a PED file with snpStats

  1. Install snpStats package. Note: this package is stored as a part of Bioconductor – a large R package collection for analysis of biological and bioinformatics data.
install.packages("BiocManager")
BiocManager::install("snpStats", update = FALSE)
  1. Load snpStats package
library(snpStats)
## Loading required package: survival
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
  1. Read ped-file into R
raw.all <- read.pedfile( file = "data.ped", snps = "data.map", which = 2 )

Look at the data

  1. Structure of the data
str( raw.all )
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:1659, 1:429] 01 01 01 01 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:1659] "1_1" "1_2" "1_3" "2_1" ...
##   .. .. .. ..$ : chr [1:429] "rs1" "rs3" "rs5" "rs6" ...
##  $ fam      :'data.frame':   1659 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 1 1 112 112 112 223 223 223 334 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 1 2 3 338 339 340 673 674 675 1007 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: 1 NA NA 112 NA NA 223 NA NA 334 ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: 1 NA NA 112 NA NA 223 NA NA 334 ...
##   ..$ sex     : num [1:1659] 2 2 1 1 2 1 1 2 1 1 ...
##   ..$ affected: num [1:1659] 1 NA 1 NA NA 1 NA 1 NA 1 ...
##  $ map      :'data.frame':   429 obs. of  5 variables:
##   ..$ V1       : int [1:429] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names: Factor w/ 429 levels "rs1","rs10","rs100",..: 1 136 278 341 407 413 421 2 11 20 ...
##   ..$ V3       : int [1:429] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1 : chr [1:429] "G" "T" "A" "C" ...
##   ..$ allele.2 : chr [1:429] "C" "A" "T" "G" ...
  1. Renaming columns to match what we have in the .map file
colnames( raw.all$map )[ c( 1,3 ) ] <- c( "chromosome", "position" )
head( raw.all$map )
  1. Renaming rows to match what we have in the .map file
rownames( raw.all$map )
##   [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
##  [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
##  [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
##  [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
##  [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
##  [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
##  [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
##  [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
##  [97] "97"  "98"  "99"  "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150" "151" "152" "153" "154" "155" "156"
## [157] "157" "158" "159" "160" "161" "162" "163" "164" "165" "166" "167" "168"
## [169] "169" "170" "171" "172" "173" "174" "175" "176" "177" "178" "179" "180"
## [181] "181" "182" "183" "184" "185" "186" "187" "188" "189" "190" "191" "192"
## [193] "193" "194" "195" "196" "197" "198" "199" "200" "201" "202" "203" "204"
## [205] "205" "206" "207" "208" "209" "210" "211" "212" "213" "214" "215" "216"
## [217] "217" "218" "219" "220" "221" "222" "223" "224" "225" "226" "227" "228"
## [229] "229" "230" "231" "232" "233" "234" "235" "236" "237" "238" "239" "240"
## [241] "241" "242" "243" "244" "245" "246" "247" "248" "249" "250" "251" "252"
## [253] "253" "254" "255" "256" "257" "258" "259" "260" "261" "262" "263" "264"
## [265] "265" "266" "267" "268" "269" "270" "271" "272" "273" "274" "275" "276"
## [277] "277" "278" "279" "280" "281" "282" "283" "284" "285" "286" "287" "288"
## [289] "289" "290" "291" "292" "293" "294" "295" "296" "297" "298" "299" "300"
## [301] "301" "302" "303" "304" "305" "306" "307" "308" "309" "310" "311" "312"
## [313] "313" "314" "315" "316" "317" "318" "319" "320" "321" "322" "323" "324"
## [325] "325" "326" "327" "328" "329" "330" "331" "332" "333" "334" "335" "336"
## [337] "337" "338" "339" "340" "341" "342" "343" "344" "345" "346" "347" "348"
## [349] "349" "350" "351" "352" "353" "354" "355" "356" "357" "358" "359" "360"
## [361] "361" "362" "363" "364" "365" "366" "367" "368" "369" "370" "371" "372"
## [373] "373" "374" "375" "376" "377" "378" "379" "380" "381" "382" "383" "384"
## [385] "385" "386" "387" "388" "389" "390" "391" "392" "393" "394" "395" "396"
## [397] "397" "398" "399" "400" "401" "402" "403" "404" "405" "406" "407" "408"
## [409] "409" "410" "411" "412" "413" "414" "415" "416" "417" "418" "419" "420"
## [421] "421" "422" "423" "424" "425" "426" "427" "428" "429"
rownames( raw.all$map ) <- raw.all$map$snp.names
head( raw.all$map )
  1. Look at structure again
str( raw.all )
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:1659, 1:429] 01 01 01 01 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:1659] "1_1" "1_2" "1_3" "2_1" ...
##   .. .. .. ..$ : chr [1:429] "rs1" "rs3" "rs5" "rs6" ...
##  $ fam      :'data.frame':   1659 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 1 1 112 112 112 223 223 223 334 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 1 2 3 338 339 340 673 674 675 1007 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: 1 NA NA 112 NA NA 223 NA NA 334 ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: 1 NA NA 112 NA NA 223 NA NA 334 ...
##   ..$ sex     : num [1:1659] 2 2 1 1 2 1 1 2 1 1 ...
##   ..$ affected: num [1:1659] 1 NA 1 NA NA 1 NA 1 NA 1 ...
##  $ map      :'data.frame':   429 obs. of  5 variables:
##   ..$ chromosome: int [1:429] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names : Factor w/ 429 levels "rs1","rs10","rs100",..: 1 136 278 341 407 413 421 2 11 20 ...
##   ..$ position  : int [1:429] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1  : chr [1:429] "G" "T" "A" "C" ...
##   ..$ allele.2  : chr [1:429] "C" "A" "T" "G" ...
  1. Phenotype data (fam) contains information about individuals: ID, sex, case or control. NOTE: snpStats codes 0 as NA in case/control status column, so we need to fix it.
head( raw.all$fam )
raw.all$fam$affected[ is.na( raw.all$fam$affected ) ] <- 0
head( raw.all$fam )
  1. Check the number of individuals
nrow( raw.all$fam )
## [1] 1659
  1. Tabulate the individuals’ gender
table( raw.all$fam$sex )
## 
##   1   2 
## 794 865
  1. Get the chromosome names
unique( raw.all$map$chromosome )
## [1]  1 14 23

Exercise 4

4.1 Tabulate the chromosome names.

table( raw.all$map$chromosome )

4.2 What are the dimensions of the genotype data? Are the individuals represented by columns or rows? (Hint: check the structure of raw.all)

dim( raw.all$genotypes )
str( raw.all$genotypes )

4.3 Check how many families are in the data?

Hint: tabulate the pedigree column of the family information

table( raw.all$fam$pedigree )
length( table( raw.all$fam$pedigree ) )

4.4 How many of the individuals were affected?

table( raw.all$fam$affected )

Extract children, chromosome and SNPs

  1. Find all the individuals with an ID ending with _1 (these are children).
head(raw.all$fam$member, 21)
##  [1] 1_1 1_2 1_3 2_1 2_2 2_3 3_1 3_2 3_3 4_1 4_2 4_3 5_1 5_2 5_3 6_1 6_2 6_3 7_1
## [20] 7_2 7_3
## 1659 Levels: 1_1 1_2 1_3 10_1 10_2 10_3 100_1 100_2 100_3 101_1 101_2 ... 99_3
children <- grep(pattern = "_1", x = raw.all$fam$member)
head(children, 30)
##  [1]  1  4  7 10 13 16 19 22 25 28 31 34 37 40 43 46 49 52 55 58 61 64 67 70 73
## [26] 76 79 82 85 88
head(raw.all$fam$member[ children ], 30)
##  [1] 1_1  2_1  3_1  4_1  5_1  6_1  7_1  8_1  9_1  10_1 11_1 12_1 13_1 14_1 15_1
## [16] 16_1 17_1 18_1 19_1 20_1 21_1 22_1 23_1 24_1 25_1 26_1 27_1 28_1 29_1 30_1
## 1659 Levels: 1_1 1_2 1_3 10_1 10_2 10_3 100_1 100_2 100_3 101_1 101_2 ... 99_3
  1. Make a new object with children only
raw.child.gen <- raw.all$genotypes[ children, ]
str(raw.child.gen)
## Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   ..@ .Data: raw [1:550, 1:429] 01 01 01 01 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:550] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. ..$ : chr [1:429] "rs1" "rs3" "rs5" "rs6" ...
raw.child.fam <- raw.all$fam[ children, ]
raw.child <- list( genotypes = raw.child.gen, 
                  fam = raw.child.fam, 
                  map = raw.all$map )
str(raw.child)
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:550, 1:429] 01 01 01 01 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:550] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. .. ..$ : chr [1:429] "rs1" "rs3" "rs5" "rs6" ...
##  $ fam      :'data.frame':   550 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 1 338 673 1007 1341 1527 1560 1593 1626 4 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ sex     : num [1:550] 2 1 1 1 2 1 2 2 2 1 ...
##   ..$ affected: num [1:550] 1 0 0 1 1 1 1 0 1 1 ...
##  $ map      :'data.frame':   429 obs. of  5 variables:
##   ..$ chromosome: int [1:429] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names : Factor w/ 429 levels "rs1","rs10","rs100",..: 1 136 278 341 407 413 421 2 11 20 ...
##   ..$ position  : int [1:429] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1  : chr [1:429] "G" "T" "A" "C" ...
##   ..$ allele.2  : chr [1:429] "C" "A" "T" "G" ...
  1. Make a new object with chromosome 1 only
  • check which SNPs are in chromosome 1
chr1.snps <- raw.all$map$snp.names[ raw.all$map$chromosome == "1" ]
chr1.snps
##   [1] rs1   rs3   rs5   rs6   rs7   rs8   rs9   rs10  rs11  rs12  rs13  rs16 
##  [13] rs17  rs18  rs19  rs20  rs21  rs22  rs23  rs26  rs27  rs28  rs29  rs30 
##  [25] rs31  rs32  rs33  rs34  rs35  rs37  rs38  rs39  rs40  rs42  rs43  rs45 
##  [37] rs47  rs48  rs49  rs50  rs51  rs52  rs53  rs54  rs55  rs56  rs58  rs59 
##  [49] rs60  rs61  rs63  rs64  rs65  rs67  rs69  rs70  rs75  rs76  rs77  rs78 
##  [61] rs80  rs81  rs82  rs83  rs86  rs87  rs88  rs90  rs91  rs92  rs93  rs95 
##  [73] rs96  rs97  rs98  rs100 rs101 rs102 rs103 rs104 rs105 rs107 rs109 rs110
##  [85] rs112 rs113 rs114 rs115 rs116 rs118 rs119 rs120 rs121 rs123 rs125 rs126
##  [97] rs127 rs128 rs129 rs131 rs132 rs133 rs135 rs136 rs139 rs140 rs141 rs142
## [109] rs144 rs150 rs151 rs153 rs155 rs156 rs158 rs160 rs161 rs163 rs170 rs171
## [121] rs172 rs173 rs174 rs175 rs176 rs177 rs178 rs179 rs180 rs181 rs186 rs187
## [133] rs188 rs190 rs192 rs203 rs204 rs205 rs208 rs209 rs211 rs212 rs213 rs214
## [145] rs218 rs220 rs224 rs225 rs226 rs227 rs228 rs229 rs230 rs231 rs232 rs234
## [157] rs235 rs236 rs237 rs241 rs243 rs245 rs246 rs247 rs248 rs250 rs252 rs253
## [169] rs258 rs264 rs266 rs268 rs272 rs273 rs274 rs275 rs276 rs277 rs282 rs283
## [181] rs284 rs285 rs286 rs287 rs288 rs289 rs291 rs292 rs293 rs294 rs297 rs298
## [193] rs299
## 429 Levels: rs1 rs10 rs100 rs101 rs102 rs103 rs104 rs105 rs107 rs109 ... rs98
  • since this is a factor, we must reformat it to numeric, to use it in selection
chr1.snps <- as.character( levels( chr1.snps )[ chr1.snps ] )
  • take only those SNPs from the genotype matrix
raw.child$genotypes <- raw.child$genotypes[ , chr1.snps ]
raw.child$map <- raw.child$map[ chr1.snps , ]
str( raw.child )
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:550, 1:193] 01 01 01 01 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:550] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. .. ..$ : chr [1:193] "rs1" "rs3" "rs5" "rs6" ...
##  $ fam      :'data.frame':   550 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 1 338 673 1007 1341 1527 1560 1593 1626 4 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ sex     : num [1:550] 2 1 1 1 2 1 2 2 2 1 ...
##   ..$ affected: num [1:550] 1 0 0 1 1 1 1 0 1 1 ...
##  $ map      :'data.frame':   193 obs. of  5 variables:
##   ..$ chromosome: int [1:193] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names : Factor w/ 429 levels "rs1","rs10","rs100",..: 1 136 278 341 407 413 421 2 11 20 ...
##   ..$ position  : int [1:193] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1  : chr [1:193] "G" "T" "A" "C" ...
##   ..$ allele.2  : chr [1:193] "C" "A" "T" "G" ...
  1. Make a new object with first ten SNPs only
raw.child.tmp <- raw.child$genotypes[ , 1:10 ]
str( raw.child.tmp )
## Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   ..@ .Data: raw [1:550, 1:10] 01 01 01 01 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:550] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. ..$ : chr [1:10] "rs1" "rs3" "rs5" "rs6" ...
  1. Make a new object with the following SNPs: rs12, rs90, rs93, rs107
head( colnames( raw.child$genotypes ) )
## [1] "rs1" "rs3" "rs5" "rs6" "rs7" "rs8"
raw.child.tmp <- raw.child$genotypes[ ,
  colnames( raw.child$genotypes ) %in% c( "rs12", "rs90", "rs93", "rs107" ) ]
str( raw.child.tmp )
## Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   ..@ .Data: raw [1:550, 1:4] 01 02 01 01 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:550] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. ..$ : chr [1:4] "rs12" "rs90" "rs93" "rs107"

Exercise 5

5.1 How many children were affected?

table( raw.child$fam$affected )
## 
##   0   1 
## 279 271

5.2 Create an object raw.mothers with mothers only

mothers <- grep( pattern = "_2", x = raw.all$fam$member )
mothers <- grep( pattern = "_2", x = raw.all$fam$member )
raw.mothers <- list( genotypes = raw.all$genotypes[ mothers, ],
                                         fam = raw.all$fam[ mothers, ],
                                         map = raw.all$map )
str( raw.mothers )
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:550, 1:429] 01 01 01 01 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:550] "1_2" "2_2" "3_2" "4_2" ...
##   .. .. .. ..$ : chr [1:429] "rs1" "rs3" "rs5" "rs6" ...
##  $ fam      :'data.frame':   550 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 2 339 674 1008 1342 1528 1561 1594 1627 5 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: NA NA NA NA NA NA NA NA NA NA ...
##   ..$ sex     : num [1:550] 2 2 2 2 2 2 2 2 2 2 ...
##   ..$ affected: num [1:550] 0 0 1 0 0 1 1 1 0 1 ...
##  $ map      :'data.frame':   429 obs. of  5 variables:
##   ..$ chromosome: int [1:429] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names : Factor w/ 429 levels "rs1","rs10","rs100",..: 1 136 278 341 407 413 421 2 11 20 ...
##   ..$ position  : int [1:429] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1  : chr [1:429] "G" "T" "A" "C" ...
##   ..$ allele.2  : chr [1:429] "C" "A" "T" "G" ...

5.3 From this new matrix, remove the following SNPs: rs12, rs90

raw.mothers.new <- raw.mothers$genotypes[ ,
  !colnames( raw.mothers$genotypes ) %in% c( "rs12", "rs90" ) ]
dim( raw.mothers.new )

Hint: exclamation mark ! gives a negation of a boolean operator

raw.mothers.new <- raw.mothers$genotypes[ ,
  !colnames( raw.mothers$genotypes ) %in% c( "rs12", "rs90" ) ]
dim( raw.mothers.new )
## [1] 550 427

Quality control

  1. Summary of the genetic data, per individual:
qc.child <- row.summary( raw.child.gen )
summary( qc.child )
##    Call.rate      Certain.calls Heterozygosity  
##  Min.   :0.7343   Min.   :1     Min.   :0.1343  
##  1st Qu.:0.9953   1st Qu.:1     1st Qu.:0.2308  
##  Median :0.9977   Median :1     Median :0.2800  
##  Mean   :0.9953   Mean   :1     Mean   :0.2825  
##  3rd Qu.:1.0000   3rd Qu.:1     3rd Qu.:0.3324  
##  Max.   :1.0000   Max.   :1     Max.   :0.4403
plot( qc.child )

  1. We don’t want the individuals with a call rate lower than 0.95:
ok.call.rate <- qc.child$Call.rate >= 0.95
raw.child.ok <- list( genotypes = raw.child$genotypes[ ok.call.rate, ],
                      fam = raw.child$fam[ ok.call.rate, ],
                      map = raw.child$map)
str(raw.child.ok)
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:543, 1:193] 01 01 01 01 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:543] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. .. ..$ : chr [1:193] "rs1" "rs3" "rs5" "rs6" ...
##  $ fam      :'data.frame':   543 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 1 338 673 1007 1341 1527 1560 1593 1626 4 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ sex     : num [1:543] 2 1 1 1 2 1 2 2 2 1 ...
##   ..$ affected: num [1:543] 1 0 0 1 1 1 1 0 1 1 ...
##  $ map      :'data.frame':   193 obs. of  5 variables:
##   ..$ chromosome: int [1:193] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names : Factor w/ 429 levels "rs1","rs10","rs100",..: 1 136 278 341 407 413 421 2 11 20 ...
##   ..$ position  : int [1:193] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1  : chr [1:193] "G" "T" "A" "C" ...
##   ..$ allele.2  : chr [1:193] "C" "A" "T" "G" ...
  1. Summary of the genetic data, per marker:
qc.child.marker <- col.summary( raw.child.ok$genotypes )
summary( qc.child.marker )
##      Calls         Call.rate      Certain.calls      RAF         
##  Min.   :463.0   Min.   :0.8527   Min.   :1     Min.   :0.01299  
##  1st Qu.:542.0   1st Qu.:0.9982   1st Qu.:1     1st Qu.:0.14180  
##  Median :543.0   Median :1.0000   Median :1     Median :0.36255  
##  Mean   :541.5   Mean   :0.9973   Mean   :1     Mean   :0.38677  
##  3rd Qu.:543.0   3rd Qu.:1.0000   3rd Qu.:1     3rd Qu.:0.57934  
##  Max.   :543.0   Max.   :1.0000   Max.   :1     Max.   :0.93370  
##       MAF               P.AA               P.AB              P.BB        
##  Min.   :0.01299   Min.   :0.003683   Min.   :0.02597   Min.   :0.00000  
##  1st Qu.:0.13076   1st Qu.:0.173112   1st Qu.:0.22284   1st Qu.:0.02399  
##  Median :0.24494   Median :0.416206   Median :0.35175   Median :0.14022  
##  Mean   :0.25545   Mean   :0.447596   Mean   :0.33127   Mean   :0.22113  
##  3rd Qu.:0.38766   3rd Qu.:0.740332   3rd Qu.:0.45672   3rd Qu.:0.35240  
##  Max.   :0.49724   Max.   :0.974026   Max.   :0.52118   Max.   :0.87109  
##      z.HWE        
##  Min.   :-3.3217  
##  1st Qu.:-1.2214  
##  Median :-0.4015  
##  Mean   :-0.4911  
##  3rd Qu.: 0.2179  
##  Max.   : 1.8038
  1. We don’t want the markers with a MAF lower than 0.05:
ok.MAF <- qc.child.marker$MAF >= 0.05
raw.child.ok <- list( genotypes = raw.child.ok$genotypes[ ,ok.MAF ],
                      fam = raw.child.ok$fam,
                      map = raw.child.ok$map[ ok.MAF, ])
str(raw.child.ok)
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:543, 1:179] 02 02 03 02 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:543] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. .. ..$ : chr [1:179] "rs3" "rs5" "rs6" "rs7" ...
##  $ fam      :'data.frame':   543 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 1 338 673 1007 1341 1527 1560 1593 1626 4 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ sex     : num [1:543] 2 1 1 1 2 1 2 2 2 1 ...
##   ..$ affected: num [1:543] 1 0 0 1 1 1 1 0 1 1 ...
##  $ map      :'data.frame':   179 obs. of  5 variables:
##   ..$ chromosome: int [1:179] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names : Factor w/ 429 levels "rs1","rs10","rs100",..: 136 278 341 407 413 421 2 11 20 29 ...
##   ..$ position  : int [1:179] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1  : chr [1:179] "T" "A" "C" "C" ...
##   ..$ allele.2  : chr [1:179] "A" "T" "G" "G" ...

Exercise 6

6.1 Remove the markers with a call rate lower than 0.99:

qc.child.marker2 <- col.summary( raw.child.ok$genotypes )
ok.markers <- qc.child.marker2$Call.rate >= 0.99
raw.child.ok <- list( genotypes = raw.child.ok$genotypes[ ,ok.markers ],
                                            fam = raw.child.ok$fam,
                      map = raw.child.ok$map[ ok.markers, ])

6.2 How many individuals and markers remain?

str( raw.child.ok )
## List of 3
##  $ genotypes:Formal class 'SnpMatrix' [package "snpStats"] with 1 slot
##   .. ..@ .Data: raw [1:543, 1:172] 02 02 03 02 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:543] "1_1" "2_1" "3_1" "4_1" ...
##   .. .. .. ..$ : chr [1:172] "rs3" "rs5" "rs6" "rs7" ...
##  $ fam      :'data.frame':   543 obs. of  6 variables:
##   ..$ pedigree: Factor w/ 550 levels "1","10","100",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ member  : Factor w/ 1659 levels "1_1","1_2","1_3",..: 1 338 673 1007 1341 1527 1560 1593 1626 4 ...
##   ..$ father  : Factor w/ 550 levels "1_3","10_3","100_3",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ mother  : Factor w/ 550 levels "1_2","10_2","100_2",..: 1 112 223 334 445 507 518 529 540 2 ...
##   ..$ sex     : num [1:543] 2 1 1 1 2 1 2 2 2 1 ...
##   ..$ affected: num [1:543] 1 0 0 1 1 1 1 0 1 1 ...
##  $ map      :'data.frame':   172 obs. of  5 variables:
##   ..$ chromosome: int [1:172] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ snp.names : Factor w/ 429 levels "rs1","rs10","rs100",..: 136 278 341 407 413 421 2 11 29 46 ...
##   ..$ position  : int [1:172] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ allele.1  : chr [1:172] "T" "A" "C" "C" ...
##   ..$ allele.2  : chr [1:172] "A" "T" "G" "G" ...

PART 3

Association analyses in snpStats

  1. Prepare the “clean” data for single.snp.tests().
support.data <- data.frame( cc = raw.child.ok$fam$affected )
rownames( support.data ) <- rownames( raw.child.ok$fam )
head( support.data )
  1. Run the analysis.
gwa.child <- single.snp.tests( phenotype = cc, 
                               data = support.data, 
                               snp.data = raw.child.ok$genotypes )
summary( gwa.child )
##        N         Chi.squared.1.df   Chi.squared.2.df        P.1df        
##  Min.   :539.0   Min.   :0.000057   Min.   : 0.001707   Min.   :0.01581  
##  1st Qu.:542.0   1st Qu.:0.102747   1st Qu.: 0.470300   1st Qu.:0.37568  
##  Median :543.0   Median :0.324869   Median : 1.214830   Median :0.56870  
##  Mean   :542.5   Mean   :0.691520   Mean   : 1.714783   Mean   :0.55022  
##  3rd Qu.:543.0   3rd Qu.:0.784846   3rd Qu.: 2.540864   3rd Qu.:0.74856  
##  Max.   :543.0   Max.   :5.823478   Max.   :12.646170   Max.   :0.99398  
##      P.2df         
##  Min.   :0.001794  
##  1st Qu.:0.280711  
##  Median :0.544757  
##  Mean   :0.537226  
##  3rd Qu.:0.790452  
##  Max.   :0.999147
  1. Make QQ-plot.
chi2 <- chi.squared( gwa.child, df = 1 )
qq.chisq( chi2, df = 1 )

##           N     omitted      lambda 
## 172.0000000   0.0000000   0.8095253
  1. Retrieve p-values.
pval <- p.value( gwa.child, df = 1 )
head( sort( pval ), 20 )
##      rs283      rs176      rs258        rs9      rs227       rs63      rs250 
## 0.01581364 0.03958302 0.05034560 0.06127947 0.07013713 0.07528907 0.07939019 
##        rs3      rs181       rs19      rs247      rs264      rs230       rs32 
## 0.08186758 0.09061798 0.09732142 0.10969309 0.11498103 0.11951170 0.12741026 
##      rs161      rs229      rs126      rs177      rs192       rs55 
## 0.13765377 0.14484085 0.15238453 0.16990444 0.17520211 0.17736579
  1. Find the most signif.SNPs
ord <- order( pval )
top10 <- head( ord, 10 )
top10
##  [1] 160 111 150   6 133  44 148   1 116  13
top10.names <- gwa.child@snp.names[ top10 ]
top10.names
##  [1] "rs283" "rs176" "rs258" "rs9"   "rs227" "rs63"  "rs250" "rs3"   "rs181"
## [10] "rs19"

Exercise 7

7.1 Repeat steps 3 to 5 using the p-values for the test with 2 df. Store the new p-values in pval2.

chi2.2df <- chi.squared( gwa.child, df = 2 )
chi2.2df <- chi.squared( gwa.child, df = 2 )
qq.chisq( chi2.2df, df = 2 )

##           N     omitted      lambda 
## 172.0000000   0.0000000   0.8582369
pval2 <- p.value( gwa.child, df = 2 )
head( sort( pval2 ), 20 )
##        rs38       rs283        rs39        rs55       rs116       rs225 
## 0.001794399 0.040014530 0.041139269 0.051028285 0.059062786 0.060952727 
##       rs170       rs212         rs3       rs247       rs118       rs258 
## 0.073555985 0.079490793 0.083618393 0.087239180 0.093138706 0.098324868 
##        rs11       rs176       rs136       rs211       rs181       rs140 
## 0.108381060 0.118988745 0.134288311 0.135222974 0.143296235 0.148301083 
##       rs208        rs19 
## 0.153220200 0.163201404
ord.2df <- order( pval2 )
top10.2df <- head( ord.2df, 10 )
top10.2df
##  [1]  29 160  30  40  80 131 106 126   1 146
top10.2df.names <- gwa.child@snp.names[ top10.2df ]
top10.2df.names
##  [1] "rs38"  "rs283" "rs39"  "rs55"  "rs116" "rs225" "rs170" "rs212" "rs3"  
## [10] "rs247"

7.2 How many SNPs were in the top 10 using both methods?

top10.2df.names[ top10.2df.names %in% top10.names ]
## [1] "rs283" "rs3"

7.3 Plot pval vs. pval2

plot( pval, pval2 )

7.4 Use abline to draw a straight line through the plot with intercept = 0 and slope = 1. How would you interpret this line?

plot( pval, pval2 )
abline( a = 0, b = 1, col = "red" )